In this lecture we talk about our first batch of new optimization tricks that can help us overcome common problems encountered when optimizing nonlinear supervised learners like neural networks.
Press the 'Toggle code' button below to toggle the code on and off for this entire presentation.
# Notebook-export helper: when this notebook is exported as HTML, hide all input
# cells by default and add a "Toggle code" button so the reader can show/hide them.
# (The jQuery check for body.notebook_app leaves the live notebook unaffected.)
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)
# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
- Slowing down on a convex function (or in any long, narrow valley) due to a vanishing gradient
- Getting caught at a non-minimum stationary point (e.g., a saddle point) of a non-convex function
# what function should we play with? Defined in the next line.
def g(w):
    """Cost function built from two one-sided clipped cubic terms.

    Each term is zero wherever its cubic argument is non-positive (the
    `np.maximum(0, ...)` clip), so the gradient can vanish over parts of
    the domain.  Accepts scalars or numpy arrays (broadcasts elementwise).
    """
    # PEP 8 (E731): use def rather than binding a lambda to a name.
    left_wall = np.maximum(0, (3 * w - 2.3) ** 3 + 1) ** 2
    right_wall = np.maximum(0, (-3 * w + 0.7) ** 3 + 1) ** 2
    return left_wall + right_wall
# run the visualizer on our chosen function, starting point, and step length alpha
demo = optlib.gradient_descent_demos.visualizer()
demo.animate_2d(
    g=g,
    w_init=0,
    steplength=0.01,
    max_its=55,
    version='normalized',
    wmin=0,
    wmax=1,
)
# what function should we play with? Defined in the next line.
def g(w):
    """Cost function built from two one-sided clipped cubic terms.

    Each term is zero wherever its cubic argument is non-positive (the
    `np.maximum(0, ...)` clip), so the gradient can vanish over parts of
    the domain.  Accepts scalars or numpy arrays (broadcasts elementwise).
    """
    # PEP 8 (E731): use def rather than binding a lambda to a name.
    left_wall = np.maximum(0, (3 * w - 2.3) ** 3 + 1) ** 2
    right_wall = np.maximum(0, (-3 * w + 0.7) ** 3 + 1) ** 2
    return left_wall + right_wall
# run the visualizer on our chosen function, starting point, and step length alpha
demo = optlib.gradient_descent_demos.visualizer()
demo.compare_versions_2d(
    g=g,
    w_init=0,
    steplength=0.01,
    max_its=80,
    version='normalized',
    wmin=0,
    wmax=1,
)
# plot the cost-function history of each gradient descent run for comparison
histories = [weight_history_1,weight_history_2]
compare_regression_histories(histories,least_squares)
# create the figure and scatter the raw data
fig, ax = plt.subplots(1, 1, figsize=(6, 3))
ax.scatter(x, y, color='k', edgecolor='w')

# final weights of each gradient descent run
w1 = weight_history_1[-1]
w2 = weight_history_2[-1]

# evaluate each fitted model on a dense grid spanning the input data
x_vals = np.linspace(min(x), max(x), 200)
y_vals_1 = [predict(v, w1) for v in x_vals]
y_vals_2 = [predict(v, w2) for v in x_vals]

# overlay the two fits (blue = first run, orange = second run)
for y_vals, line_color in zip((y_vals_1, y_vals_2), ('blue', 'orange')):
    ax.plot(x_vals, y_vals, color=line_color)
plt.show()

where $a = 0$, $\mathbf{b} = \begin{bmatrix} 1 \\ 1 \end{bmatrix}$, and $\mathbf{C} = \begin{bmatrix} 1 & 0 \\ 0 & 12 \end{bmatrix}$
# visualize both runs' weight trajectories over a contour plot of the cost
import contour_run_comparison
demo = contour_run_comparison.Visualizer()
demo.show_paths(g, weight_history_1, weight_history_4, num_contours=20)
where $a = 0$, $\mathbf{b} = \begin{bmatrix} 1 \\ 1 \end{bmatrix}$, and $\mathbf{C} = \begin{bmatrix} 0 & 0 \\ 0 & 12 \end{bmatrix}$
# visualize the two weight histories over a contour plot of the cost
demo = contour_run_comparison.Visualizer()
demo.show_paths(g, weight_history_1, weight_history_2, num_contours=20)
import classification_2d_demos_v2

# create an instance of the logistic regression demo and load in the data,
# cost function, and descent history
demo3 = classification_2d_demos_v2.Visualizer(data, tanh_least_squares)

# animate the descent process
demo3.animate_runs(weight_history_1, weight_history_4, num_contours=25)